from utils import url_manager
import requests
from bs4 import BeautifulSoup
import re

# Seed URL: the crawl starts here and only follows article links that match
# ARTICLE_URL_PATTERN.
root_url = "http://www.crazyant.net"

# Compiled once, outside the loops. The dots are escaped so they match a
# literal "." instead of any character.
ARTICLE_URL_PATTERN = re.compile(r'^http://www\.crazyant\.net/\d+\.html$')

urls = url_manager.UrlManager()
urls.add_new_url(root_url)

# The context manager guarantees the output file is closed even if the crawl
# aborts with an exception. UTF-8 is requested explicitly so that non-ASCII
# page titles can be written regardless of the platform's default encoding.
with open("a.txt", "w", encoding="utf-8") as fileOut:
    # Main crawl loop: take a pending URL, fetch it, record its title, then
    # queue any newly discovered article URLs. Runs until the URL pool is
    # empty.
    while urls.has_new_url():
        current_url = urls.get_url()
        try:
            r = requests.get(current_url, timeout=10)
        except requests.RequestException as exc:
            # A timeout or connection error on one page should not abort the
            # whole crawl; report it and move on to the next URL.
            print("error,request failed", current_url, exc)
            continue
        if r.status_code != 200:
            print("error,return status_code is not 200", current_url)
            continue

        soup = BeautifulSoup(r.text, "html.parser")
        # Pages without a <title> tag yield soup.title == None; record the
        # title as None instead of raising AttributeError.
        title = soup.title.string if soup.title is not None else None
        fileOut.write("%s\t%s\n" % (current_url, title))
        # Flush after every page so progress is on disk if the run is
        # interrupted.
        fileOut.flush()
        print("success: %s,%s,%d" % (current_url, title, len(urls.new_urls)))

        # Link discovery: article links are the <a> inside each
        # <h2 class="entry-title">; those matching the article URL pattern
        # are added to the URL pool.
        links = soup.find_all("h2", class_="entry-title")
        for link in links:
            item = link.find("a")
            print(item)  # trace output of the discovered anchor
            if item is None:
                # Heading without an anchor: nothing to follow.
                continue
            # .get() returns None for a missing attribute, whereas item["href"]
            # would raise KeyError.
            href = item.get("href")
            if href is None:
                continue

            if ARTICLE_URL_PATTERN.match(href):
                urls.add_new_url(href)
